{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import featuretools as ft"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.22.2.post1\n",
      "0.13.4\n"
     ]
    }
   ],
   "source": [
    "import sklearn\n",
    "print(sklearn.__version__)\n",
    "print(ft.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 单个数据表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset = load_iris()\n",
    "X = dataset.data\n",
    "y = dataset.target\n",
    "iris_feature_names = dataset.feature_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sepal length (cm)</th>\n",
       "      <th>sepal width (cm)</th>\n",
       "      <th>petal length (cm)</th>\n",
       "      <th>petal width (cm)</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5.1</td>\n",
       "      <td>3.5</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>4.9</td>\n",
       "      <td>3.0</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>4.7</td>\n",
       "      <td>3.2</td>\n",
       "      <td>1.3</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4.6</td>\n",
       "      <td>3.1</td>\n",
       "      <td>1.5</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5.0</td>\n",
       "      <td>3.6</td>\n",
       "      <td>1.4</td>\n",
       "      <td>0.2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)\n",
       "0                5.1               3.5                1.4               0.2\n",
       "1                4.9               3.0                1.4               0.2\n",
       "2                4.7               3.2                1.3               0.2\n",
       "3                4.6               3.1                1.5               0.2\n",
       "4                5.0               3.6                1.4               0.2"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame(X, columns=iris_feature_names)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Entityset: single_dataframe\n",
       "  Entities:\n",
       "    iris [Rows: 150, Columns: 5]\n",
       "  Relationships:\n",
       "    No relationships"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import featuretools as ft\n",
    "es = ft.EntitySet(id='single_dataframe')           # 实体集命名为single_dataframe\n",
    "# 支持的数据类型可以看https://docs.featuretools.com/en/stable/api_reference.html#variable-types\n",
    "#variable_types = {col:ft.variable_types.Numeric for col in df.columns}\n",
    "es.entity_from_dataframe(entity_id='iris',         # 增加一个数据框，命名为iris\n",
    "             dataframe=df,\n",
    "#              variable_types=variable_types,\n",
    "             index='index',\n",
    "             make_index=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## max_depth等于1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Built 34 features\n",
      "Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████\n"
     ]
    }
   ],
   "source": [
    "trans_primitives=['add_numeric', 'subtract_numeric', 'multiply_numeric', 'divide_numeric']  # 采用的特征基元，我们这里2列相加减乘除来生成\n",
    "# ft.list_primitives()  # 查看可使用的特征集元\n",
    "feature_matrix, feature_names = ft.dfs(entityset=es, \n",
    "     target_entity='iris', \n",
    "     max_depth=1,    # max_depth=1，只在原特征上进行操作，产生新特征\n",
    "     verbose=1,\n",
    "     trans_primitives=trans_primitives\n",
    ")\n",
    "# 不会同特征加减乘除，即没有a+a等情况\n",
    "# 加和乘的新特征数+原始特征数，feature_num*(feature_num-1)/2+feature_num，所以这里是4*3/2+4=10\n",
    "# 减和除的新特征数+原始特征数，feature_num*(feature_num-1)+feature_num，所以这里是4*3+4=16\n",
    "# 实际上应该是10*2+16*2-4*3=40，4*3减去重复的3原始特征3次\n",
    "# 这是因为0.13.4的featuretools默认减法满足交换律"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[<Feature: sepal length (cm)>,\n",
       " <Feature: sepal width (cm)>,\n",
       " <Feature: petal length (cm)>,\n",
       " <Feature: petal width (cm)>,\n",
       " <Feature: petal length (cm) + sepal length (cm)>,\n",
       " <Feature: sepal length (cm) + sepal width (cm)>,\n",
       " <Feature: petal width (cm) + sepal width (cm)>,\n",
       " <Feature: petal length (cm) + petal width (cm)>,\n",
       " <Feature: petal length (cm) + sepal width (cm)>,\n",
       " <Feature: petal width (cm) + sepal length (cm)>,\n",
       " <Feature: petal length (cm) - sepal length (cm)>,\n",
       " <Feature: sepal length (cm) - sepal width (cm)>,\n",
       " <Feature: petal width (cm) - sepal width (cm)>,\n",
       " <Feature: petal length (cm) - petal width (cm)>,\n",
       " <Feature: petal length (cm) - sepal width (cm)>,\n",
       " <Feature: petal width (cm) - sepal length (cm)>,\n",
       " <Feature: petal length (cm) * sepal length (cm)>,\n",
       " <Feature: sepal length (cm) * sepal width (cm)>,\n",
       " <Feature: petal width (cm) * sepal width (cm)>,\n",
       " <Feature: petal length (cm) * petal width (cm)>,\n",
       " <Feature: petal length (cm) * sepal width (cm)>,\n",
       " <Feature: petal width (cm) * sepal length (cm)>,\n",
       " <Feature: petal length (cm) / sepal length (cm)>,\n",
       " <Feature: sepal length (cm) / sepal width (cm)>,\n",
       " <Feature: sepal length (cm) / petal width (cm)>,\n",
       " <Feature: sepal width (cm) / petal width (cm)>,\n",
       " <Feature: petal width (cm) / sepal width (cm)>,\n",
       " <Feature: petal length (cm) / sepal width (cm)>,\n",
       " <Feature: petal length (cm) / petal width (cm)>,\n",
       " <Feature: sepal width (cm) / sepal length (cm)>,\n",
       " <Feature: sepal width (cm) / petal length (cm)>,\n",
       " <Feature: petal width (cm) / sepal length (cm)>,\n",
       " <Feature: petal width (cm) / petal length (cm)>,\n",
       " <Feature: sepal length (cm) / petal length (cm)>]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_names"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Built 40 features\n",
      "Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████\n"
     ]
    }
   ],
   "source": [
    "# 设置commutative=False，让减法不满足交换律\n",
    "trans_primitives=['add_numeric', ft.primitives.SubtractNumeric(commutative=False), 'multiply_numeric', 'divide_numeric']  # 采用的特征基元，我们这里2列相加减乘除来生成\n",
    "feature_matrix, feature_names = ft.dfs(entityset=es, \n",
    "     target_entity='iris', \n",
    "     max_depth=1,    # max_depth=1，只在原特征上进行操作，产生新特征\n",
    "     verbose=1,\n",
    "     trans_primitives=trans_primitives\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "生成的特征可能会出现 np.nan 或者 np.inf，表示 空值 或者 无穷大。如果原始特征中没有这样异常数据，前者可能由“0/0”造成，后者可能由“R/0”造成，R是实数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sepal length (cm)                        0\n",
       "sepal width (cm)                         0\n",
       "petal length (cm)                        0\n",
       "petal width (cm)                         0\n",
       "petal length (cm) + sepal length (cm)    0\n",
       "sepal length (cm) + sepal width (cm)     0\n",
       "petal width (cm) + sepal width (cm)      0\n",
       "petal length (cm) + petal width (cm)     0\n",
       "petal length (cm) + sepal width (cm)     0\n",
       "petal width (cm) + sepal length (cm)     0\n",
       "petal length (cm) - sepal length (cm)    0\n",
       "sepal length (cm) - sepal width (cm)     0\n",
       "sepal length (cm) - petal width (cm)     0\n",
       "sepal width (cm) - petal width (cm)      0\n",
       "petal width (cm) - sepal width (cm)      0\n",
       "petal length (cm) - sepal width (cm)     0\n",
       "petal length (cm) - petal width (cm)     0\n",
       "sepal width (cm) - sepal length (cm)     0\n",
       "sepal width (cm) - petal length (cm)     0\n",
       "petal width (cm) - sepal length (cm)     0\n",
       "petal width (cm) - petal length (cm)     0\n",
       "sepal length (cm) - petal length (cm)    0\n",
       "petal length (cm) * sepal length (cm)    0\n",
       "sepal length (cm) * sepal width (cm)     0\n",
       "petal width (cm) * sepal width (cm)      0\n",
       "petal length (cm) * petal width (cm)     0\n",
       "petal length (cm) * sepal width (cm)     0\n",
       "petal width (cm) * sepal length (cm)     0\n",
       "petal length (cm) / sepal length (cm)    0\n",
       "sepal length (cm) / sepal width (cm)     0\n",
       "sepal length (cm) / petal width (cm)     0\n",
       "sepal width (cm) / petal width (cm)      0\n",
       "petal width (cm) / sepal width (cm)      0\n",
       "petal length (cm) / sepal width (cm)     0\n",
       "petal length (cm) / petal width (cm)     0\n",
       "sepal width (cm) / sepal length (cm)     0\n",
       "sepal width (cm) / petal length (cm)     0\n",
       "petal width (cm) / sepal length (cm)     0\n",
       "petal width (cm) / petal length (cm)     0\n",
       "sepal length (cm) / petal length (cm)    0\n",
       "dtype: int64"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_matrix.replace([np.inf, -np.inf], np.nan)  # np.inf都用np.nan代替\n",
    "feature_matrix.isnull().sum()                      # 查看可能存在的缺失值情况"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## max_depth不为1\n",
    "注意基元的顺序带来的影响，这里以2个加减两个基元说明"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Built 100 features\n",
      "Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████\n"
     ]
    }
   ],
   "source": [
    "# 先乘再除\n",
    "feat_matrix, feat_names = ft.dfs(entityset=es, \n",
    "                     target_entity='iris', \n",
    "                     max_depth=2, \n",
    "                     verbose=1,\n",
    "                     trans_primitives=['multiply_numeric', 'divide_numeric'],\n",
    ")\n",
    "# 乘法基元处理后特征数（包含原特征）一共有4*3/2+4=10个\n",
    "# 除法基元会在乘法处理后的10个特征上，进行除法操作，所以这样会有10*9+10=100个特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Built 136 features\n",
      "Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████\n"
     ]
    }
   ],
   "source": [
    "# 先除再乘\n",
    "feat_matrix, feat_names = ft.dfs(entityset=es, \n",
    "                     target_entity='iris', \n",
    "                     max_depth=2, \n",
    "                     verbose=1,\n",
    "                     trans_primitives=['divide_numeric', 'multiply_numeric']\n",
    ")\n",
    "# 除法基元处理后特征数（包含原特征）一共有4*3+4=16个\n",
    "# 同样地，乘法在这16个特征上进行操作，会有16*15/2+16=136个特征"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 多个数据表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_1 = pd.DataFrame({'id':[0,1,2,3], 'a':[1,2,2,3], 'b':[2,4,4,5]})\n",
    "df_2 = pd.DataFrame({'id':[0,1,1,2,3], 'c':[1,3,3,2,5], 'd':[5,6,7,9,8]})\n",
    "\n",
    "es = ft.EntitySet(id='double_dataframe')\n",
    "es.entity_from_dataframe(entity_id='df_1',         # 增加一个数据框\n",
    "             dataframe=df_1,\n",
    "             index='id')\n",
    "es.entity_from_dataframe(entity_id='df_2',         # 增加一个数据框\n",
    "             dataframe=df_2,\n",
    "             index='index',\n",
    "             make_index=True)\n",
    "# 通过 id 关联 df_1 和 df_2 实体\n",
    "relation = ft.Relationship(es['df_1']['id'], es['df_2']['id'])\n",
    "es = es.add_relationship(relation)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Built 7 features\n",
      "Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████\n"
     ]
    }
   ],
   "source": [
    "trans_primitives=['add_numeric']\n",
    "agg_primitives=['sum', 'median']\n",
    "feature_matrix, feature_names = ft.dfs(entityset=es, \n",
    "                     target_entity='df_1', \n",
    "                     max_depth=1, \n",
    "                     verbose=1,\n",
    "                     agg_primitives=agg_primitives,\n",
    "                     trans_primitives=trans_primitives)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>SUM(df_2.c)</th>\n",
       "      <th>SUM(df_2.d)</th>\n",
       "      <th>MEDIAN(df_2.c)</th>\n",
       "      <th>MEDIAN(df_2.d)</th>\n",
       "      <th>a + b</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>5.0</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>13</td>\n",
       "      <td>3</td>\n",
       "      <td>6.5</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>9.0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>5</td>\n",
       "      <td>8.0</td>\n",
       "      <td>8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    a  b  SUM(df_2.c)  SUM(df_2.d)  MEDIAN(df_2.c)  MEDIAN(df_2.d)  a + b\n",
       "id                                                                       \n",
       "0   1  2            1            5               1             5.0      3\n",
       "1   2  4            6           13               3             6.5      6\n",
       "2   2  4            2            9               2             9.0      6\n",
       "3   3  5            5            8               5             8.0      8"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Built 23 features\n",
      "Elapsed: 00:00 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████\n"
     ]
    }
   ],
   "source": [
    "trans_primitives=['add_numeric']\n",
    "agg_primitives=['sum', 'median']\n",
    "feature_matrix, feature_names = ft.dfs(entityset=es, \n",
    "                     target_entity='df_1', \n",
    "                     max_depth=2, \n",
    "                     verbose=1,\n",
    "                     agg_primitives=agg_primitives,\n",
    "                     trans_primitives=trans_primitives)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>SUM(df_2.c)</th>\n",
       "      <th>SUM(df_2.d)</th>\n",
       "      <th>MEDIAN(df_2.c)</th>\n",
       "      <th>MEDIAN(df_2.d)</th>\n",
       "      <th>a + b</th>\n",
       "      <th>SUM(df_2.c + d)</th>\n",
       "      <th>MEDIAN(df_2.c + d)</th>\n",
       "      <th>MEDIAN(df_2.c) + MEDIAN(df_2.d)</th>\n",
       "      <th>...</th>\n",
       "      <th>MEDIAN(df_2.d) + SUM(df_2.c)</th>\n",
       "      <th>a + SUM(df_2.d)</th>\n",
       "      <th>a + SUM(df_2.c)</th>\n",
       "      <th>b + MEDIAN(df_2.d)</th>\n",
       "      <th>b + SUM(df_2.d)</th>\n",
       "      <th>b + MEDIAN(df_2.c)</th>\n",
       "      <th>b + SUM(df_2.c)</th>\n",
       "      <th>a + MEDIAN(df_2.c)</th>\n",
       "      <th>a + MEDIAN(df_2.d)</th>\n",
       "      <th>SUM(df_2.c) + SUM(df_2.d)</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>5.0</td>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>6.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>...</td>\n",
       "      <td>6.0</td>\n",
       "      <td>6</td>\n",
       "      <td>2</td>\n",
       "      <td>7.0</td>\n",
       "      <td>7</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>6.0</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>13</td>\n",
       "      <td>3</td>\n",
       "      <td>6.5</td>\n",
       "      <td>6</td>\n",
       "      <td>19</td>\n",
       "      <td>9.5</td>\n",
       "      <td>9.5</td>\n",
       "      <td>...</td>\n",
       "      <td>12.5</td>\n",
       "      <td>15</td>\n",
       "      <td>8</td>\n",
       "      <td>10.5</td>\n",
       "      <td>17</td>\n",
       "      <td>7</td>\n",
       "      <td>10</td>\n",
       "      <td>5</td>\n",
       "      <td>8.5</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>9</td>\n",
       "      <td>2</td>\n",
       "      <td>9.0</td>\n",
       "      <td>6</td>\n",
       "      <td>11</td>\n",
       "      <td>11.0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>...</td>\n",
       "      <td>11.0</td>\n",
       "      <td>11</td>\n",
       "      <td>4</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13</td>\n",
       "      <td>6</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>11.0</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>5</td>\n",
       "      <td>8</td>\n",
       "      <td>5</td>\n",
       "      <td>8.0</td>\n",
       "      <td>8</td>\n",
       "      <td>13</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13.0</td>\n",
       "      <td>...</td>\n",
       "      <td>13.0</td>\n",
       "      <td>11</td>\n",
       "      <td>8</td>\n",
       "      <td>13.0</td>\n",
       "      <td>13</td>\n",
       "      <td>10</td>\n",
       "      <td>10</td>\n",
       "      <td>8</td>\n",
       "      <td>11.0</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    a  b  SUM(df_2.c)  SUM(df_2.d)  MEDIAN(df_2.c)  MEDIAN(df_2.d)  a + b  \\\n",
       "id                                                                          \n",
       "0   1  2            1            5               1             5.0      3   \n",
       "1   2  4            6           13               3             6.5      6   \n",
       "2   2  4            2            9               2             9.0      6   \n",
       "3   3  5            5            8               5             8.0      8   \n",
       "\n",
       "    SUM(df_2.c + d)  MEDIAN(df_2.c + d)  MEDIAN(df_2.c) + MEDIAN(df_2.d)  ...  \\\n",
       "id                                                                        ...   \n",
       "0                 6                 6.0                              6.0  ...   \n",
       "1                19                 9.5                              9.5  ...   \n",
       "2                11                11.0                             11.0  ...   \n",
       "3                13                13.0                             13.0  ...   \n",
       "\n",
       "    MEDIAN(df_2.d) + SUM(df_2.c)  a + SUM(df_2.d)  a + SUM(df_2.c)  \\\n",
       "id                                                                   \n",
       "0                            6.0                6                2   \n",
       "1                           12.5               15                8   \n",
       "2                           11.0               11                4   \n",
       "3                           13.0               11                8   \n",
       "\n",
       "    b + MEDIAN(df_2.d)  b + SUM(df_2.d)  b + MEDIAN(df_2.c)  b + SUM(df_2.c)  \\\n",
       "id                                                                             \n",
       "0                  7.0                7                   3                3   \n",
       "1                 10.5               17                   7               10   \n",
       "2                 13.0               13                   6                6   \n",
       "3                 13.0               13                  10               10   \n",
       "\n",
       "    a + MEDIAN(df_2.c)  a + MEDIAN(df_2.d)  SUM(df_2.c) + SUM(df_2.d)  \n",
       "id                                                                     \n",
       "0                    2                 6.0                          6  \n",
       "1                    5                 8.5                         19  \n",
       "2                    4                11.0                         11  \n",
       "3                    8                11.0                         13  \n",
       "\n",
       "[4 rows x 23 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
